libname red "/.../F5500/microdata";
libname blue "/.../data";
libname icf "/.../lehd_s2014_icf/2014";
libname temp "/.../data/temp";

%macro statelib(st);
	libname ecft26&st "/.../lehd_s2014_ecft26/&st";
	libname ecf&st "/.../lehd_s2014_ecf/&st";
	libname ehf&st "/.../lehd_s2014_ehf/&st";
%mend statelib;

%statelib(ak); %statelib(ar); %statelib(az); %statelib(ca); %statelib(co); 
%statelib(ct); %statelib(dc); %statelib(de); %statelib(fl); %statelib(ga); 
%statelib(hi); %statelib(ia); %statelib(id); %statelib(il); %statelib(in); 
%statelib(ks); %statelib(ky); %statelib(la); %statelib(ma); %statelib(md); 
%statelib(me); %statelib(mi); %statelib(mn); %statelib(mo); %statelib(ms);
%statelib(mt); %statelib(nc); %statelib(nd); %statelib(ne); %statelib(nh);
%statelib(nj); %statelib(nm); %statelib(nv); %statelib(ny); %statelib(oh);
%statelib(ok); %statelib(or); %statelib(pa); %statelib(ri); %statelib(sc);
%statelib(sd); %statelib(tn); %statelib(tx); %statelib(ut); %statelib(va);
%statelib(vt); %statelib(wa); %statelib(wi); %statelib(wv); %statelib(wy);

%include "/.../programs/includes/format_source_2000.sas";
%include "/.../programs/includes/format_source_2001.sas";
%include "/.../programs/includes/format_source_2002.sas";
%include "/.../programs/includes/format_source_2004.sas";
%include "/.../programs/includes/format_source_2005.sas";
%include "/.../programs/includes/format_source_2014.sas";

/**********************************************************************
* Author : Dhiren Patki
*
* This program links DB sponsoring employers to the LEHD.
* A panel of DB sponsoring firms is produced in 03_create_firmpanel.do
* This program creates a balanced of workers at DB sponsoring firms
* at the person-year-group level	
***********************************************************************/

%let states=ak ar az ca co ct dc de fl ga hi ia id il in ks ky
la ma md me mi mn mo ms mt nc nd ne nh nj nm nv ny oh ok or pa ri sc
sd tn tx ut va vt wa wi wv wy;
%let numstates=50;

/*Clean up temp directory*/
proc datasets lib=temp nolist kill;
quit;
run;

/*Pull in set of firm-years from firm panel*/
data blue.abvcut_firms;
	infile "/.../data/abvcut_firms.csv" delimiter = ',' missover dsd lrecl=/*redacted*/ firstobs=/*redacted*/; 
	informat firmid /*redacted*/;
	informat year /*redacted*/;
	informat group /*redacted*/;
	informat treat_flag /*redacted*/;
	format firmid /*redacted*/;
	format year /*redacted*/;
	format group /*redacted*/;
	format treat_flag /*redacted*/;
	input firmid $
	      year
	      group 
	      treat_flag;
run;

%macro buildpanel;

	%do i=1 %to &numstates.;

	/*Declare "thisstate" (picking a state from the list of possible states)*/
	%let thisstate=%scan(&states.,&i.);

	/*-------------------------------------------------------------------------------
	* Step 1: Get all sein's associated with the firmids on the DB employer file
	* using the the ECFT26 file. 						
	*-------------------------------------------------------------------------------*/

	/*Next, merge with the ecft26 by firmid pulling all the sein's associated with the firmid's in "thisstate"*/
	proc sql;
		create table dbecfs_&thisstate as 
		select a.firmid, a.group, a.treat_flag, b.sein, b.year, b.fas_firm_id
		from blue.abvcut_firms as a inner join
		ecft26&thisstate..ecf_&thisstate._t26 as b
		on a.firmid = b.fas_firm_id & a.year = b.year;
	quit;
	
	/*De-dup by group-sein-year (gets rid of seinunit and quarter duplications)*/
	proc sort data = dbecfs_&thisstate nodupkey;
		by group sein year;
	run;

	/*-------------------------------------------------------------------------------
	* Step 2: Using the sein's obtain all workers (pik's) who worked for DB sponsors
	* by merging with the EHF (which is a sein-pik panel)	
	*-------------------------------------------------------------------------------*/
	
	/*Merge with the EHF (gives all pik-years in "thisstate")*/
	proc sql;
		create table ee_link_&thisstate as 
		select a.*, b.pik
		from dbecfs_&thisstate as a inner join
		ehf&thisstate..ehf_&thisstate as b 
		on a.sein = b.sein & a.year = b.year;
	quit;
	
	/*List of unique pik's*/
	proc sort data = ee_link_&thisstate out = unq_piks_&thisstate (keep = pik firmid group year treat_flag) nodupkey;
		by group pik;
	run;
	
	/*-------------------------------------------------------------------------------
	* Step 3: Use the list of unique pik's to pull the full in-state history
	* for workers who ever were employed at DB firms in c-5 using ehf.							
	*-------------------------------------------------------------------------------*/
	
	/*Merge with the ehf to get the full IN-STATE (=thisstate) history for workers
	that ever worked for a DB sponsor in thisstate*/
	proc sql;
		create table ee_panel0_&thisstate.&i as
		select a.firmid, a.group, a.treat_flag, b.*	
		from unq_piks_&thisstate as a inner join
		ehf&thisstate..ehf_&thisstate as b
		on a.pik=b.pik;
	quit;

	/*Add SEIN level data from the ECF (first de-duplicate by sein and qtr)*/

	proc sort data = ecf&thisstate..ecf_&thisstate._sein out= ecf_&thisstate 
		(keep = sein year mode_es_sic_emp) nodupkey;
		by sein year;
	run;

	proc sql;
		create table ee_panel1_&thisstate.&i as
		select a.*, b.mode_es_sic_emp
		from ee_panel0_&thisstate.&i as a inner join
		ecf_&thisstate as b
		on a.sein = b.sein & a.year = b.year;
	quit;

	/*Pull T26 information from ECFT26 (first de-duplicate by sein and qtr)*/

	proc sort data = ecft26&thisstate..ecf_&thisstate._t26 out= ecft26_&thisstate 
		(keep = fas_ein fas_ein_match_lbd
		fas_firm_age fas_firm_id fas_firm_size sein year) nodupkey;
		by sein year;
	run;

	proc sql;
		create table ee_panel_&thisstate.&i (compress=yes) as 
		select a.*, b.fas_ein, b.fas_ein_match_lbd,
		b.fas_firm_age, b.fas_firm_id, b.fas_firm_size
		from ee_panel1_&thisstate.&i as a inner join /*make sure all in state obs are mapped*/
		ecft26_&thisstate as b 
		on a.sein = b.sein & a.year = b.year;
	quit;

	/*Clean up the big file to conserve space*/
	proc datasets lib=work nolist;
		delete ecft26_&thisstate;
	quit;
	run;
	
		/*Loop over the complement of states to get full OUT-OF-STATE histories for workers
		that ever worked for a DB sponsor in thisstate*/
		%do j=1 %to &numstates.;
		
			%let notthisst=%scan(&states.,&j.);
			%if &j ~= &i %then %do;

				proc sql;
					create table ee_panel0_&thisstate.&j as /*thisstate complement #j*/
					select a.firmid, a.group, a.treat_flag, b.*
					from unq_piks_&thisstate as a inner join	
					ehf&notthisst..ehf_&notthisst as b
					on a.pik = b.pik;
				quit;

				/*Add SEIN level data from the ECF (first de-duplicate by sein and qtr)*/

				proc sort data = ecf&notthisst..ecf_&notthisst._sein out= ecf_&notthisst 
					(keep = sein year mode_es_sic_emp) nodupkey;
				by sein year;
				run;
				
				proc sql;
					create table ee_panel1_&thisstate.&j as
					select a.*, b.mode_es_sic_emp
					from ee_panel0_&thisstate.&j as a inner join
					ecf_&notthisst as b
					on a.sein = b.sein & a.year = b.year;
				quit;

				/*Pull T26 information from ECFT26 (first de-duplicate by sein and qtr)*/
			
				proc sort data = ecft26&notthisst..ecf_&notthisst._t26 out= ecft26_&notthisst 
					(keep = fas_ein fas_ein_match_lbd
					fas_firm_age fas_firm_id fas_firm_size sein year) nodupkey;
					by sein year;
				run;
			
				proc sql;
					create table ee_panel_&thisstate.&j (compress=yes) as 
					select a.*, b.fas_ein, b.fas_ein_match_lbd,
					b.fas_firm_age, b.fas_firm_id, b.fas_firm_size
					from ee_panel1_&thisstate.&j as a inner join /*make sure all in state obs are mapped*/
					ecft26_&notthisst   as b 
					on a.sein = b.sein & a.year = b.year;
				quit;
				
				/*Clean up the big file to conserve space*/
				proc datasets lib=work nolist;
					delete ecft26_&notthisst;
				quit;
				run;

			%end;
		%end;	

	/*Stack the in-state and out-of-state histories*/
		%do k=1 %to &numstates.;
		
			proc append base = temp.ee_panel_&thisstate data = ee_panel_&thisstate.&k;
			run;
		%end;

	/*Clean up the working directory*/
	proc datasets lib=work nolist kill;
	quit;
	run;


	%end;
%mend;

%buildpanel;


/*-------------------------------------------------------------------------------
* Step 4: Stack the state-level files and de-duplicate
* There are duplicates because, for e.g., a worker selected from thisstate = WI 
* may also show up in a different spell in IL. These two spells will be repeated 
* when thisstate=IL
*-------------------------------------------------------------------------------*/

data temp.ee_panel0 (compress=yes);
	set temp.ee_panel_ak temp.ee_panel_ar temp.ee_panel_az
	     temp.ee_panel_ca temp.ee_panel_co temp.ee_panel_ct temp.ee_panel_dc
	     temp.ee_panel_de temp.ee_panel_fl temp.ee_panel_ga temp.ee_panel_hi 
	     temp.ee_panel_ia temp.ee_panel_id temp.ee_panel_il temp.ee_panel_in
	     temp.ee_panel_ks temp.ee_panel_ky temp.ee_panel_la temp.ee_panel_md
	     temp.ee_panel_me temp.ee_panel_mi temp.ee_panel_mn temp.ee_panel_mo
	     temp.ee_panel_ms temp.ee_panel_mt temp.ee_panel_nc temp.ee_panel_nd
	     temp.ee_panel_ne temp.ee_panel_nh temp.ee_panel_nj temp.ee_panel_nm
	     temp.ee_panel_nv temp.ee_panel_ny temp.ee_panel_oh temp.ee_panel_ok
	     temp.ee_panel_or temp.ee_panel_pa temp.ee_panel_ri temp.ee_panel_sc
	     temp.ee_panel_sd temp.ee_panel_tn temp.ee_panel_tx temp.ee_panel_ut
	     temp.ee_panel_va temp.ee_panel_vt temp.ee_panel_wa temp.ee_panel_wi
	     temp.ee_panel_wv temp.ee_panel_wy temp.ee_panel_ma;
	if year = 2015 then delete; /*drop the first qtr of 2015 which is in the s2014 LEHD*/
run;

/*Clean up base state level files*/
proc datasets lib=temp nolist;
	delete ee_panel_ak ee_panel_ar ee_panel_az
	     ee_panel_ca ee_panel_co ee_panel_ct ee_panel_dc
	     ee_panel_de ee_panel_fl ee_panel_ga ee_panel_hi 
	     ee_panel_ia ee_panel_id ee_panel_il ee_panel_in
	     ee_panel_ks ee_panel_ky ee_panel_la ee_panel_md
	     ee_panel_me ee_panel_mi ee_panel_mn ee_panel_mo
	     ee_panel_ms ee_panel_mt ee_panel_nc ee_panel_nd
	     ee_panel_ne ee_panel_nh ee_panel_nj ee_panel_nm
	     ee_panel_nv ee_panel_ny ee_panel_oh ee_panel_ok
	     ee_panel_or ee_panel_pa ee_panel_ri ee_panel_sc
	     ee_panel_sd ee_panel_tn ee_panel_tx ee_panel_ut
	     ee_panel_va ee_panel_vt ee_panel_wa ee_panel_wi
	     ee_panel_wv ee_panel_wy ee_panel_ma;
quit;
run;

proc sort data = temp.ee_panel0 nodupkey;
	by group pik sein seinunit year;
run;

/*Add info from ICF*/
proc sql;
	create table temp.ee_panel (compress=yes) as 
	select a.*,b.dob,b.pob,b.educ_c,b.ethnicity,b.race,b.sex,
	year(b.dob) as yob
	from temp.ee_panel0 as a inner join
	icf.icf_us as b 
	on a.pik = b.pik;
quit;

proc datasets lib=temp nolist;
	delete ee_panel0;
quit;
run;

/*-------------------------------------------------------------------------------
* Step 5: Use panel of all the workers that worked for a DB sponsor in c-5
* to make a balanced panel of one pik/year and calculate tenure								
*-------------------------------------------------------------------------------*/
	
/*First create a set of piks to build a balanced panel*/

data piks (keep=pik);
	set temp.ee_panel (keep=pik year);
run;

proc sort data = piks nodupkey;
	by pik;
run;

data temp.expand_pik (keep=pik year);
	set piks;
	do y = 1985 to 2014; 
		year = y;
		output;
	end;
run;

/*Calcuate tenure at each fas_firm_id and fas_ein*/
proc sort data = temp.ee_panel;
	by group pik fas_firm_id year;
run;

data temp.ee_panel;
	set temp.ee_panel;
	/*Tenure calc*/
	tenure_alpha + 1;
	by group pik fas_firm_id;
	if first.group or first.pik or first.fas_firm_id then tenure_alpha=1;
run;

/*Calcuate tenure at each fas_ein*/
proc sort data = temp.ee_panel;
	by group pik fas_ein year;
run;

data temp.ee_panel;
	set temp.ee_panel;
	/*Tenure calc*/
	tenure_ein + 1;
	by group pik fas_ein;
	if first.group or first.pik or first.fas_ein then tenure_ein=1;
run;

proc sort data = temp.ee_panel;
	by fas_firm_id year;
run;


/*Keep the highest paying job for each year*/
proc sort data=temp.ee_panel;
	by group pik year;
run;

proc means noprint data = temp.ee_panel;	
	by group pik year;
	var earn_ann;
	output out = maxearn
	max(earn_ann) = high_earn /*job with highest earnings*/
	sum(earn_ann) = total_earn; /*total earnings for this calendar year*/
run;
	
proc sql;
	create table temp.ee_panel1 as 
	select a.*, b.high_earn, b.total_earn
	from temp.ee_panel as a inner join
	maxearn as b
	on a.pik = b.pik & a.year = b.year;
quit;

proc datasets lib=temp nolist;
	delete ee_panel maxearn;
quit;
run;

proc datasets lib=work nolist;
	delete maxearn;
quit;
run;

data temp.ee_panel1;
	set temp.ee_panel1;
	where earn_ann = high_earn; /*keep highest paying job*/
run;

/*Drop a subset of workers for data quality reasons*/

proc sort data = temp.ee_panel1;
	by pik;
run;

data temp.ee_panel1;
	set temp.ee_panel1;
	flag=0;
	if year=group-5 & firmid~=fas_firm_id then flag=1;
run;

proc means noprint data = temp.ee_panel1;
	by pik;
	var flag;   
	output out = dbascn
	max(flag) = dbascn;
run;

proc sql;
	create table temp.ee_panel2 as 
	select a.*, b.dbascn
	from temp.ee_panel1 as a inner join
	dbascn as b
	on a.pik = b.pik;
quit;

proc datasets lib=temp nolist;
	delete ee_panel1 dbascn;
quit;
run;

proc datasets lib=work nolist;
	delete dbascn;
quit;
run;

proc freq data = temp.ee_panel2;
tables dbascn;
run;

data temp.ee_panel2 (drop=dbascn);
	set temp.ee_panel2;
	where dbascn=0;  /*drop workers without a surviving DB association*/
run;

/*Eliminate a small fraction of pik-year duplicates*/
proc sort data=temp.ee_panel2 nodupkey;
	by group pik year;
run;
	
/*Calculate tenure in terms of highest paying job (not necessarily equal to true tenure)
* This is the number of years that a particular job has been the highest paying job
* he_tenure is "high earnings tenure". Also calculate tenure in terms of continuous years
* of work which is designated cont_tenure.*/

proc sort data = temp.ee_panel2;
	by group pik fas_firm_id year;
run;

data temp.ee_panel2;
	set temp.ee_panel2;
	he_tenure_alpha + 1;
	cont_tenure_alpha + 1;
	lyear = lag1(year);
	by group pik fas_firm_id;
	if first.pik or first.fas_firm_id then he_tenure_alpha=1;
	if first.pik or first.fas_firm_id or lyear+1~=year then cont_tenure_alpha=1;

	label 
	he_tenure_alpha = "High earnings tenure (alpha)"
	cont_tenure_alpha = "Continuous years of work tenure (alpha)";
run;

proc sort data = temp.ee_panel2;
	by group pik fas_ein year;
run;

data temp.ee_panel2;
	set temp.ee_panel2;
	he_tenure_ein + 1;
	cont_tenure_ein + 1;
	lyear = lag1(year);
	by group pik fas_ein;
	if first.pik or first.fas_ein then he_tenure_ein=1;
	if first.pik or first.fas_ein or lyear+1~=year then cont_tenure_ein=1;

	label 
	he_tenure_ein = "High earnings tenure (EIN)"
	cont_tenure_ein = "Continuous years of work tenure (EIN)";
run;

/*-------------------------------------------------------------------------------
* Step 8: Extract workers who meet analysis criteria for each experiment 
* year. Age>=20 & Age<=70 in year, no tenure restriction in c-5						
*-------------------------------------------------------------------------------*/

/*Split the data set by group*/

%macro splitbygp(yy);

	data temp.ee_panel2_&yy;
		set temp.ee_panel2;
		where group=&yy;
	run;

	/*Merge (left join) the expanded pik list with the ee_panel to make a perfectly balanced panel*/ 
	proc sql;
		create table temp.ee_panel3_&yy as 
		select *
		from temp.expand_pik as a left join
		temp.ee_panel2_&yy as b
		on a.pik = b.pik & a.year = b.year;
	quit;


	proc datasets lib=temp nolist;
		delete ee_panel2_&yy;
	quit;
	run;


	data temp.ee_panel3_&yy (drop = flag lyear);
		set temp.ee_panel3_&yy;
		age_at_c = group-yob;
		source_at_cm5 =.;
		if year=group-5 then source_at_cm5 = input(source,2.);
		if year=group-5 then tenure_cm5 = cont_tenure_ein ;
	run;

	proc sort data = temp.ee_panel3_&yy;
		by pik;
	run;

	proc means noprint data = temp.ee_panel3_&yy;
		by pik;
		var yob age_at_c source_at_cm5 tenure_cm5 group treat_flag;
		output out=sample_crit
		max(yob) = mxyob
		max(age_at_c) = mxage_at_c
		max(source_at_cm5) = mxsource_at_cm5
		max(tenure_cm5) = max_tenure_cm5
		max(group) = mxgroup
		max(treat_flag) = mxtreat_flag;
	run;

	proc sql;
		create table temp.ee_panel4_&yy as
		select a.*, b.max_tenure_cm5, b.mxyob, b.mxage_at_c, b.mxgroup, b.mxtreat_flag, b.mxsource_at_cm5
		from temp.ee_panel3_&yy as a inner join
		sample_crit as b	
		on a.pik = b.pik;
	quit;

	
	proc datasets lib=temp nolist;
		delete ee_panel3_&yy;
	quit;
	run;


	proc datasets lib=work nolist;
		delete sample_crit;
	quit;
	run;

	/*Impose the restrictions*/
	data temp.ee_panel4_&yy (drop=tenure_cm5 age_at_c yob group treat_flag source_at_cm5
		rename= (mxyob=yob mxage_at_c=age_at_c max_tenure_cm5=tenure_cm5 mxsource_at_cm5 = source_at_cm5
                         mxgroup = group mxtreat_flag=treat_flag));
		set temp.ee_panel4_&yy;
		where mxage_at_c>=20 & mxage_at_c<=70;
		age = year-mxyob;
	run;

	/*Tack on firmid for pik-years that are force balanced*/
	data temp.firmids_&yy (keep = pik firmid);
	      set temp.ee_panel4_&yy;
	      where year=group-5;
	run;
      
	/*This is a check for whether the data are correctly constructed. No dups*/
	proc sort data = temp.firmids_&yy nodupkey;
	      by pik;
	run;

	proc sql;
		create table temp.ee_panel5_&yy as 
		select a.*, b.firmid as mxfirmid
		from temp.ee_panel4_&yy as a left join
		temp.firmids_&yy as b on
		a.pik = b.pik;
	quit;

	proc sort data = temp.ee_panel5_&yy (drop = firmid rename = (mxfirmid=firmid));
		by group pik year;
	run;

	proc datasets lib=temp nolist;
		delete firmids_&yy ee_panel4_&yy;
	quit;
	run;
	

%mend;

%splitbygp(2001);
%splitbygp(2002);
%splitbygp(2003);
%splitbygp(2004);
%splitbygp(2005);
%splitbygp(2006);
%splitbygp(2007);
%splitbygp(2008);
%splitbygp(2009);
%splitbygp(2010);
%splitbygp(2011);
%splitbygp(2012);
%splitbygp(2013);
%splitbygp(2014);

proc datasets lib=temp nolist;
	delete ee_panel2 expand_pik;
quit;
run;

/*Re-assemble the balanced group panels, save and export*/

data blue.ee_panel_fin;
	set temp.ee_panel5_2001;
run;

%macro append_gp;

%do k = 2002 %to 2014;
	
	proc append base = blue.ee_panel_fin (compress=yes) data = temp.ee_panel5_&k;
	run;
%end;

%mend;

%append_gp;

proc export data = blue.ee_panel_fin
	outfile = "/.../data/ee_panel_fin.dta"
	dbms = dta replace;
run;

/*Clean up temp directory*/
proc datasets lib=temp nolist kill;
quit;
run;
